Link to notebook
Link to github repo.
library(tidyverse)
library(readxl)
library(phyloseq)
library(dada2)
library(Biostrings)
library(DECIPHER)
library(phangorn)
library(readr)
library(seqinr)
library(decontam)
library(ape)
library(vegan)
#library(philr)
library(RColorBrewer)
library(microbiome)
library(DESeq2)
library(compositions);
library(cowplot)
library(plotly)
library(htmlwidgets)
library(withr)
# Load the per-sample metadata table; readr guesses the column types.
metadata <- read_csv(file = "sample_data.csv")
[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────[39m
cols(
SampleID = [31mcol_character()[39m,
`Year.Trawl#` = [31mcol_character()[39m,
Datecode = [32mcol_double()[39m,
Date = [31mcol_character()[39m,
Month = [32mcol_double()[39m,
Year = [32mcol_double()[39m,
Bayside = [31mcol_character()[39m,
Station = [31mcol_character()[39m
)
Import the count table and the taxonomy file. I slightly modified otutable.csv in Excel (saved as otutable_mod.csv) to remove the quotes around the sequence names and to put an NA placeholder as the first column name (the cell above the row names).
# Import the ASV count table. No rows are skipped; read_table2() parses the
# file as whitespace-delimited text despite its .csv extension.
count_table <- read_table2(file = "results/otutable_mod.csv")
Missing column names filled in: 'X1' [1]
[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────[39m
cols(
.default = col_double(),
X1 = [31mcol_character()[39m
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.
# The first (originally unnamed) column is given the name SampleID.
names(count_table)[1] <- "SampleID"
# Read the BLAST-derived taxonomic assignment of each ASV.
taxonomy <- read_csv("results/tax_sequences_blast_taxonomy.csv")
Missing column names filled in: 'X1' [1]Duplicated column names deduplicated: 'RefSeq_Tax_ID' => 'RefSeq_Tax_ID_1' [18]
[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────[39m
cols(
X1 = [32mcol_double()[39m,
ASV_ID = [31mcol_character()[39m,
ref_seq_ID = [31mcol_character()[39m,
PID = [32mcol_double()[39m,
alnmt_len = [32mcol_double()[39m,
mismatch = [32mcol_double()[39m,
eval = [32mcol_double()[39m,
bscore = [32mcol_double()[39m,
RefSeq_Tax_ID = [32mcol_double()[39m,
Ref_Seq_title = [31mcol_character()[39m,
superkingdom = [31mcol_character()[39m,
phylum = [31mcol_character()[39m,
class = [31mcol_character()[39m,
order = [31mcol_character()[39m,
family = [31mcol_character()[39m,
genus = [31mcol_character()[39m,
species = [31mcol_character()[39m,
RefSeq_Tax_ID_1 = [32mcol_double()[39m
)
# Tidy the BLAST taxonomy table and attach the curated common names.
# Drop the leading column of sequential row numbers.
taxonomy <- select(taxonomy, -1)
# Keep only hits with percent identity above 92 (recommended by Sara).
# filter() also drops rows whose PID is NA.
taxonomy <- filter(taxonomy, PID > 92)
# Remove the BLAST metadata and retain only ASV_ID plus the taxonomic ranks
# (necessary for the tax_table construction further down). The columns are
# named explicitly rather than by position so that a reordered input cannot
# silently drop the wrong ones; all_of() errors if an expected column is
# missing.
drop.cols <- c(
  "ref_seq_ID", "PID", "alnmt_len", "mismatch", "eval", "bscore",
  "RefSeq_Tax_ID", "Ref_Seq_title", "RefSeq_Tax_ID_1"
)
taxonomy <- select(taxonomy, -all_of(drop.cols))
# Import the common names curated by Sara (7th sheet of the workbook) and
# join them to the taxonomy by ASV_ID.
commonnames <- read_excel("Trawls MASTER 2020 _mod_ES.xlsx", sheet = 7)
commonnames
taxonomy <- left_join(taxonomy, commonnames, by = "ASV_ID")
taxonomy
NA
Filtering removed seqs 110 and 332 (Gobiosoma ginsburgi and Belone belone). Note for Sara: should we consider setting this cutoff at 97%, which is more robust and still leaves 334 unique ASVs (rather than 379 with the 92% cutoff used in the settings above)?
Preview datasets
# Auto-print each of the three input tables for a quick visual check
count_table
taxonomy
metadata
I want to use the phyloseq package for some plotting/ statistics, which first requires making phyloseq objects out of each of input data tables-
# Wrap each input table in its phyloseq component class.
# Counts: columns 2-392 are taken as the ASV columns (column 1 is the
# character SampleID column); samples are rows, hence taxa_are_rows = FALSE.
count_table_matrix <- as.matrix(count_table[, 2:392])
rownames(count_table_matrix) <- count_table[["SampleID"]]
ASV <- otu_table(count_table_matrix, taxa_are_rows = FALSE)
# Taxonomy: columns 2-9 become the rank columns, with ASV_ID as row names.
taxonomy_matrix <- as.matrix(taxonomy[, 2:9])
rownames(taxonomy_matrix) <- taxonomy[["ASV_ID"]]
TAX <- tax_table(taxonomy_matrix)
# Sample metadata, with SampleID as row names.
META <- sample_data(data.frame(metadata, row.names = metadata[["SampleID"]]))
First check that the inputs are in compatible formats by checking for ASV names with the phyloseq function, taxa_names
head(taxa_names(TAX))
[1] "Seq_1" "Seq_2" "Seq_3" "Seq_4" "Seq_5" "Seq_6"
head(taxa_names(ASV))
[1] "Seq_1" "Seq_2" "Seq_3" "Seq_4" "Seq_5" "Seq_6"
And check sample names were also detected
# Sample names in the count table are formatted as sample ID, underscore,
# fastq ID ("S" + digits). The fastq ID is no longer needed, so strip it to
# make the names match the sample names from the metadata.
sample_names(ASV) <- str_replace_all(
  sample_names(ASV),
  pattern = "_S[:digit:]+",
  replacement = ""
)
head(sample_names(ASV))
[1] "T1PosCon" "T1S10" "T1S11" "T1S1" "T1S2" "T1S3"
head(sample_names(META))
[1] "T1PosCon" "T1S1" "T1S2" "T1S3" "T1S5" "T1S6"
And make the phyloseq object
# Combine counts, taxonomy and sample metadata into a single phyloseq object.
ps <- phyloseq(ASV, TAX, META)
# Draw one rarefaction curve per sample from the count table.
rarecurve(x = otu_table(ps), step = 50, cex = 0.5)
empty rows removed
# Save the rarefaction curves as an .eps file: switch the postscript defaults
# to EPS, open the file device, and redraw the same plot into it. The device
# stays open until the dev.off() call that follows.
setEPS()
postscript("Figures/rarefaction.eps")
rarecurve(otu_table(ps), step=50, cex=0.5)
empty rows removed
dev.off()
quartz_off_screen
2
Most samples look like they were sampled to completion. Be wary of T3S11, T1S2, and maybe T4S5.
Check some features of the phyloseq object
rank_names(ps)
[1] "superkingdom" "phylum" "class" "order" "family" "genus" "species"
[8] "CommonName"
unique(tax_table(ps)[, "superkingdom"])
Taxonomy Table: [2 taxa by 1 taxonomic ranks]:
superkingdom
Seq_1 "Eukaryota"
Seq_377 NA
unique(tax_table(ps)[, "phylum"])
Taxonomy Table: [3 taxa by 1 taxonomic ranks]:
phylum
Seq_1 "Chordata"
Seq_368 "Arthropoda"
Seq_377 NA
unique(tax_table(ps)[, "class"])
Taxonomy Table: [5 taxa by 1 taxonomic ranks]:
class
Seq_1 "Actinopteri"
Seq_63 "Mammalia"
Seq_362 "Chondrichthyes"
Seq_368 "Insecta"
Seq_377 NA
There are some ASVs with NA as superkingdom, phylum, or class annotation- delete these.
# Keep only ASVs that are annotated at superkingdom, phylum and class level.
ps <- subset_taxa(ps, !(is.na(superkingdom) | is.na(phylum) | is.na(class)))
unique(tax_table(ps)[, "superkingdom"])
Taxonomy Table: [1 taxa by 1 taxonomic ranks]:
superkingdom
Seq_1 "Eukaryota"
unique(tax_table(ps)[, "phylum"])
Taxonomy Table: [2 taxa by 1 taxonomic ranks]:
phylum
Seq_1 "Chordata"
Seq_368 "Arthropoda"
unique(tax_table(ps)[, "class"])
Taxonomy Table: [4 taxa by 1 taxonomic ranks]:
class
Seq_1 "Actinopteri"
Seq_63 "Mammalia"
Seq_362 "Chondrichthyes"
Seq_368 "Insecta"
nrow(tax_table(ps)) # number of ASVs left
[1] 378
378 ASVs still remain…
Also check class Mammalia, to see if contamination or real:
tax_table(subset_taxa(ps, class == 'Mammalia'))
Taxonomy Table: [8 taxa by 8 taxonomic ranks]:
superkingdom phylum class order family genus species CommonName
Seq_63 "Eukaryota" "Chordata" "Mammalia" "Primates" "Hominidae" "Homo" "Homo sapiens" "Human"
Seq_88 "Eukaryota" "Chordata" "Mammalia" "Artiodactyla" "Suidae" "Sus" "Sus scrofa" "Wild boar"
Seq_157 "Eukaryota" "Chordata" "Mammalia" "Primates" "Hominidae" "Homo" "Homo sapiens" "Human"
Seq_343 "Eukaryota" "Chordata" "Mammalia" "Carnivora" "Felidae" "Felis" "Felis catus" "Cat"
Seq_369 "Eukaryota" "Chordata" "Mammalia" "Artiodactyla" "Bovidae" "Bos" "Bos taurus" "Cattle"
Seq_378 "Eukaryota" "Chordata" "Mammalia" "Primates" "Hominidae" "Homo" "Homo sapiens" "Human"
Seq_383 "Eukaryota" "Chordata" "Mammalia" "Primates" "Hominidae" "Homo" "Homo sapiens" "Human"
Seq_389 "Eukaryota" "Chordata" "Mammalia" "Primates" "Hominidae" "Homo" "Homo sapiens" "Human"
These are human, wild boar, cat (…cat lady), and cattle. All are contamination so delete all Mammalia
# Drop every ASV assigned to class Mammalia (treated as contamination).
ps <- subset_taxa(ps, class != "Mammalia")
unique(tax_table(ps)[, "class"])
Taxonomy Table: [3 taxa by 1 taxonomic ranks]:
class
Seq_1 "Actinopteri"
Seq_362 "Chondrichthyes"
Seq_368 "Insecta"
Next check the “Insecta” entries
tax_table(subset_taxa(ps, class == 'Insecta'))
Taxonomy Table: [2 taxa by 8 taxonomic ranks]:
superkingdom phylum class order family genus species
Seq_368 "Eukaryota" "Arthropoda" "Insecta" "Hymenoptera" "Formicidae" "Linepithema" "Linepithema humile"
Seq_380 "Eukaryota" "Arthropoda" "Insecta" "Hymenoptera" "Formicidae" "Linepithema" "Linepithema humile"
CommonName
Seq_368 "Ant"
Seq_380 "Ant"
The only Insecta is Linepithema humile, which is an ant, so delete these too.
# Drop every ASV assigned to class Insecta.
ps <- subset_taxa(ps, class != "Insecta")
unique(tax_table(ps)[, "class"])
Taxonomy Table: [2 taxa by 1 taxonomic ranks]:
class
Seq_1 "Actinopteri"
Seq_362 "Chondrichthyes"
Check overall how the phyla are distributed among samples
# Agglomerate the ASVs at the superkingdom level using the phyloseq function
# tax_glom, so the bar plot shows per-sample totals.
superkingdomGlommed <- tax_glom(ps, "superkingdom")
# Build the plot once, display it, then save that same object.
seqdepth_barplot <- plot_bar(superkingdomGlommed, x = "Sample")
seqdepth_barplot
ggsave(
  filename = "Figures/seqdepth.eps",
  plot = seqdepth_barplot,
  units = "in",
  width = 9,
  height = 6,
  dpi = 300
)
Total sequences reveals certain samples had very low sequencing effort: T1S7, T1S8, T3S11, and, not as bad, T1S2 and T4S5
The rarefaction analysis also showed T1S2 and T4S5 samples were likely not sequenced to completion. Therefore remove these 5 samples from analysis
# Remove the five samples with low sequencing effort or incomplete sampling.
ps <- subset_samples(
  ps,
  !(SampleID %in% c("T1S7", "T1S8", "T3S11", "T1S2", "T4S5"))
)
ps
phyloseq-class experiment-level object
otu_table() OTU Table: [ 368 taxa and 50 samples ]
sample_data() Sample Data: [ 50 samples by 8 sample variables ]
tax_table() Taxonomy Table: [ 368 taxa by 8 taxonomic ranks ]
50 samples remaining with 368 ASVs
Remove Pos Controls (all hits in positive controls are the same family- I assume this is expected)
# Remove the three positive-control samples.
ps <- subset_samples(
  ps,
  !(SampleID %in% c("T1PosCon", "T2PosCon", "T3PosCon"))
)
ps
phyloseq-class experiment-level object
otu_table() OTU Table: [ 368 taxa and 47 samples ]
sample_data() Sample Data: [ 47 samples by 8 sample variables ]
tax_table() Taxonomy Table: [ 368 taxa by 8 taxonomic ranks ]
And lastly, correct some taxonomy: According to Sara, Engraulis encrasicolus (European anchovy) should be Anchoa mitchilli (Bay anchovy):
# Replace the species label wherever it occurs in the taxonomy table.
# NOTE(review): only the matching text is rewritten; the genus column of the
# affected rows still reads "Engraulis" afterwards - confirm whether it should
# become "Anchoa" as well.
tax_table(ps) <- gsub(
  pattern = "Engraulis encrasicolus",
  replacement = "Anchoa mitchilli",
  x = tax_table(ps)
)
47 samples remain, with 368 unique ASVs.
For plotting, use relative abundances (# of ASV sequences/sum total sequences in sample), calculated easily using microbiome::transform
# Convert counts to per-sample relative abundances.
ps_ra <- microbiome::transform(x = ps, transform = "compositional")
Export the relative abundance matrix so Sara can have it:
# Pull the relative-abundance table out of the phyloseq object as a plain
# matrix, turn it into a data frame, and write it to disk with row names.
RelAbun_matrix <- as(otu_table(ps_ra), "matrix")
RelAbun_dataframe <- as.data.frame(RelAbun_matrix)
write.csv(
  RelAbun_dataframe,
  file = "results/otutable_relabun.csv",
  row.names = TRUE
)
Then agglomerate the ASVs at the family level using the phyloseq function tax_glom.
# Merge ASVs by family, then draw a stacked bar per sample coloured by family.
familyGlommed_RA <- tax_glom(ps_ra, taxrank = "family")
family_barplot <- plot_bar(familyGlommed_RA, x = "Sample", fill = "family")
family_barplot
NOTES for Sara
Agglomerate by species to see if I get the same 38 unique species Sara sees:
# Agglomerate at the CommonName rank (the last column of the taxonomy table).
speciesGlommed_RA <- tax_glom(ps_ra, taxrank = "CommonName")
speciesGlommed_RA
phyloseq-class experiment-level object
otu_table() OTU Table: [ 43 taxa and 47 samples ]
sample_data() Sample Data: [ 47 samples by 8 sample variables ]
tax_table() Taxonomy Table: [ 43 taxa by 8 taxonomic ranks ]
tax_table(speciesGlommed_RA)
Taxonomy Table: [43 taxa by 8 taxonomic ranks]:
superkingdom phylum class order family
Seq_1 "Eukaryota" "Chordata" "Actinopteri" "Atheriniformes" "Atherinopsidae"
Seq_2 "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes" "Clupeidae"
Seq_3 "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes" "Engraulidae"
Seq_4 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Pomatomidae"
Seq_5 "Eukaryota" "Chordata" "Actinopteri" "Lutjaniformes" "Lutjanidae"
Seq_6 "Eukaryota" "Chordata" "Actinopteri" "Pleuronectiformes" "Paralichthyidae"
Seq_7 "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes" "Clupeidae"
Seq_9 "Eukaryota" "Chordata" "Actinopteri" "Gobiiformes" "Gobiidae"
Seq_10 "Eukaryota" "Chordata" "Actinopteri" "Pleuronectiformes" "Scophthalmidae"
Seq_11 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Serranidae"
Seq_12 "Eukaryota" "Chordata" "Actinopteri" "Spariformes" "Sparidae"
Seq_15 "Eukaryota" "Chordata" "Actinopteri" NA "Sciaenidae"
Seq_16 "Eukaryota" "Chordata" "Actinopteri" NA "Sciaenidae"
Seq_17 "Eukaryota" "Chordata" "Actinopteri" "Labriformes" "Labridae"
Seq_19 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Cottidae"
Seq_20 "Eukaryota" "Chordata" "Actinopteri" "Pleuronectiformes" "Pleuronectidae"
Seq_21 "Eukaryota" "Chordata" "Actinopteri" NA "Moronidae"
Seq_22 "Eukaryota" "Chordata" "Actinopteri" "Syngnathiformes" "Syngnathidae"
Seq_30 "Eukaryota" "Chordata" "Actinopteri" "Pleuronectiformes" "Paralichthyidae"
Seq_33 "Eukaryota" "Chordata" "Actinopteri" NA "Sciaenidae"
Seq_34 "Eukaryota" "Chordata" "Actinopteri" "Labriformes" "Labridae"
Seq_36 "Eukaryota" "Chordata" "Actinopteri" "Anguilliformes" "Anguillidae"
Seq_38 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Scombridae"
Seq_40 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Gasterosteidae"
Seq_44 "Eukaryota" "Chordata" "Actinopteri" "Cyprinodontiformes" "Fundulidae"
Seq_50 "Eukaryota" "Chordata" "Actinopteri" "Atheriniformes" "Atherinopsidae"
Seq_52 "Eukaryota" "Chordata" "Actinopteri" "Gadiformes" "Phycidae"
Seq_54 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Scombridae"
Seq_57 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Triglidae"
Seq_67 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Scombridae"
Seq_82 "Eukaryota" "Chordata" "Actinopteri" NA "Sciaenidae"
Seq_84 "Eukaryota" "Chordata" "Actinopteri" "Gadiformes" "Gadidae"
Seq_102 "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes" "Engraulidae"
Seq_103 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Cottidae"
Seq_115 "Eukaryota" "Chordata" "Actinopteri" "Cyprinodontiformes" "Fundulidae"
Seq_119 "Eukaryota" "Chordata" "Actinopteri" "Gadiformes" "Phycidae"
Seq_139 "Eukaryota" "Chordata" "Actinopteri" "Batrachoidiformes" "Batrachoididae"
Seq_141 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Scombridae"
Seq_181 "Eukaryota" "Chordata" "Actinopteri" "Tetraodontiformes" "Tetraodontidae"
Seq_231 "Eukaryota" "Chordata" "Actinopteri" "Gadiformes" "Merlucciidae"
Seq_359 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Triglidae"
Seq_362 "Eukaryota" "Chordata" "Chondrichthyes" "Myliobatiformes" "Myliobatidae"
Seq_372 "Eukaryota" "Chordata" "Chondrichthyes" "Carcharhiniformes" "Triakidae"
genus species CommonName
Seq_1 "Menidia" "Menidia menidia" "Atlantic silverside"
Seq_2 "Brevoortia" "Brevoortia tyrannus" "Atlantic menhaden"
Seq_3 "Engraulis" "Anchoa mitchilli" "Bay anchovy"
Seq_4 "Pomatomus" "Pomatomus saltatrix" "Bluefish"
Seq_5 "Lutjanus" "Lutjanus griseus" "Grey snapper"
Seq_6 "Paralichthys" "Paralichthys dentatus" "Summer flounder"
Seq_7 "Alosa" "Alosa mediocris" "Hickory shad"
Seq_9 "Gobiosoma" "Gobiosoma ginsburgi" "Seaboard goby"
Seq_10 "Scophthalmus" "Scophthalmus aquosus" "Windowpane flounder"
Seq_11 "Centropristis" "Centropristis striata" "Black seabass"
Seq_12 "Stenotomus" "Stenotomus chrysops" "Scup"
Seq_15 "Leiostomus" "Leiostomus xanthurus" "Spot"
Seq_16 "Menticirrhus" "Menticirrhus saxatilis" "Northern kingfish"
Seq_17 "Tautoga" "Tautoga onitis" "Tautog"
Seq_19 "Myoxocephalus" "Myoxocephalus aenaeus" "Grubby sculpin"
Seq_20 "Pseudopleuronectes" "Pseudopleuronectes americanus" "Winter flounder"
Seq_21 "Morone" "Morone saxatilis" "Striped bass"
Seq_22 "Syngnathus" "Syngnathus fuscus" "Northern pipefish"
Seq_30 "Etropus" "Etropus microstomus" "Smallmouth flounder"
Seq_33 "Cynoscion" "Cynoscion regalis" "Weakfish"
Seq_34 "Tautogolabrus" "Tautogolabrus adspersus" "Cunner"
Seq_36 "Anguilla" "Anguilla rostrata" "American eel"
Seq_38 "Thunnus" "Thunnus obesus" "Bigeye tuna"
Seq_40 "Apeltes" "Apeltes quadracus" "Fourspine stickleback"
Seq_44 "Fundulus" "Fundulus majalis" "Striped killifish"
Seq_50 "Membras" "Membras martinica" "Rough silverside"
Seq_52 "Urophycis" "Urophycis floridana" "Spotted hake"
Seq_54 "Scomber" "Scomber japonicus" "Chub mackerel"
Seq_57 "Prionotus" "Prionotus carolinus" "Northern searobin"
Seq_67 "Thunnus" "Thunnus thynnus" "Atlantic bluefin tuna"
Seq_82 "Bairdiella" "Bairdiella chrysoura" "American silver perch"
Seq_84 "Microgadus" "Microgadus tomcod" "Atlantic tomcod"
Seq_102 "Engraulis" "Engraulis mordax" "Bay anchovy"
Seq_103 "Myoxocephalus" "Myoxocephalus quadricornis" "Fourhorn sculpin"
Seq_115 "Fundulus" "Fundulus heteroclitus" "Mummichog"
Seq_119 "Urophycis" "Urophycis floridana" "Red hake"
Seq_139 "Opsanus" "Opsanus tau" "Oyster toadfish"
Seq_141 "Katsuwonus" "Katsuwonus pelamis" "Skipjack tuna"
Seq_181 "Sphoeroides" "Sphoeroides maculatus" "Northern puffer"
Seq_231 "Merluccius" "Merluccius bilinearis" "Silver hake"
Seq_359 "Prionotus" "Prionotus evolans" "Striped searobin"
Seq_362 "Rhinoptera" "Rhinoptera bonasus" "Cownose ray"
Seq_372 "Mustelus" "Mustelus canis" "Dusky smooth-hound shark"
NOTES for Sara
Based on my previous scripts with Cariaco Eukaryotic data
# Convert the agglomerated phyloseq object to a long data frame using
# phyloseq's psmelt, then keep only rows with a non-zero, non-missing
# abundance so that absent taxa do not appear as small dots in the plot.
# Only the Abundance column is tested, so zeros that may occur in any other
# column are left untouched.
species_df <- psmelt(speciesGlommed_RA)
species_df <- filter(species_df, !is.na(Abundance), Abundance != 0)
Plot by species, scientific name
# Bubble plot of relative abundance per species (scientific name) and station,
# faceted by Datecode (rows) and Bayside (columns).
bubbleplot_eDNA_sciname <- ggplot(
  species_df,
  # fct_rev() reverses the species order so the y axis reads alphabetically
  # from top to bottom.
  aes(x = Station, y = fct_rev(species), color = Station)
) +
  geom_point(aes(size = Abundance, fill = Station), color = "black", pch = 21) +
  # A single size scale is declared. A preceding scale_size() call would be
  # replaced by this one anyway and only triggers a "scale already present"
  # message.
  scale_size_area(breaks = c(0, .25, .5, .75, 1), max_size = 6) +
  xlab("") +
  ylab("") +
  labs(size = "Relative Abundance") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  facet_grid(Datecode ~ Bayside, scales = "free", space = "free", drop = TRUE)
Scale for 'size' is already present. Adding another scale for 'size', which will replace the existing
scale.
bubbleplot_eDNA_sciname
Plot by species common name
Export figures
# Save the scientific-name bubble plot. The plot object created in this
# notebook is called bubbleplot_eDNA_sciname.
ggsave(
  filename = "Figures/speciesbubbleplot_eDNA_sciname.eps",
  plot = bubbleplot_eDNA_sciname,
  units = "in",
  width = 7,
  height = 12,
  dpi = 300
)
# Save the common-name bubble plot.
# NOTE(review): speciesbubbleplot_eDNA_comname is not created anywhere in the
# visible part of this notebook - confirm that it is defined before this call.
ggsave(
  filename = "Figures/speciesbubbleplot_eDNA_comname.eps",
  plot = speciesbubbleplot_eDNA_comname,
  units = "in",
  width = 7,
  height = 12,
  dpi = 300
)
NOTE on the above: the common-name plot has two entries in the Bay anchovy row because, as mentioned above, two different species names are labelled as Bay anchovy. Is it OK to group these as the same species (Anchoa mitchilli)?
NEXT: Sara wants the average relative abundances for each species across all dates.
# Sheet 4 of the trawl workbook: morphometric data for each individual
# collected on every date.
trawl_master <- read_excel("Trawls MASTER 2020 _mod_ES.xlsx", sheet = 4)
trawl_master
# Sheet 6 of the same workbook: station info.
stations <- read_excel("Trawls MASTER 2020 _mod_ES.xlsx", sheet = 6)
stations
NA
Make an equivalent to an OTU table, grouping by date and location and representing counts for every unique species
# Count the individuals of each species per date code and station. count()
# returns an ungrouped table, so no leftover grouping is carried into later
# steps. It is namespaced because seqinr, loaded after the tidyverse, also
# exports a function called count().
trawl_counts <- trawl_master %>%
  dplyr::count(DATECODE, STATION_NO, CommonName, name = "count")
trawl_counts
and link station names instead of numbers to count table
# Attach the station info to every count row via the station number.
trawl_counts <- trawl_counts %>%
  left_join(stations, by = "STATION_NO")
trawl_counts
# Bubble plot of trawl catch per species (common name) and station, faceted by
# DATECODE (rows) and BAYSIDE (columns); bubble size encodes log10(count).
# NOTE(review): log10(1) is 0 and scale_size_area() maps 0 to zero area, so a
# species counted exactly once at a station may be drawn invisibly - confirm
# whether that is intended.
speciesbubbleplot_trawl_comname <- ggplot(
  trawl_counts,
  aes(x = STATION_NA, y = fct_rev(CommonName), color = STATION_NA)
) +
  geom_point(
    aes(size = log10(count), fill = STATION_NA),
    color = "black",
    pch = 21
  ) +
  # A single size scale is declared. A preceding scale_size() call would be
  # replaced by this one anyway and only triggers a "scale already present"
  # message.
  scale_size_area(breaks = c(.01, .1, .3, .5, 1, 3), max_size = 6) +
  xlab("") +
  ylab("") +
  labs(size = "Log(counts)", fill = "Station") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  facet_grid(DATECODE ~ BAYSIDE, scales = "free", space = "free", drop = TRUE)
Scale for 'size' is already present. Adding another scale for 'size', which will replace the existing
scale.
speciesbubbleplot_trawl_comname
Export figure
# Write the trawl bubble plot to an .eps file (6.75 x 15 inches).
ggsave(
  filename = "Figures/speciesbubbleplot_trawl_comname.eps",
  plot = speciesbubbleplot_trawl_comname,
  units = "in",
  width = 6.75,
  height = 15,
  dpi = 300
)
PCA is essentially a type of PCoA using the Euclidean distance matrix as input. When combined with a log-ratio transformation of the count table, this is deemed appropriate for compositional datasets. It is also recommended as a first step in exploratory analyses.
First do a CLR, centered log ratio transformation of the absolute abundance data (after filtering), as suggested by Gloor et al. 2017
# Centered log-ratio (CLR) transform of the filtered count table, stored as a
# data frame.
# NOTE(review): the count table presumably contains zeros; confirm how
# compositions::clr() treats them before interpreting the ordination.
clr_asv_table_ps <- otu_table(ps) %>%
  compositions::clr() %>%
  data.frame()
Generate the PCA and visualize axes
# Generate a Principal Component Analysis (PCA) of the CLR-transformed table.
lograt_pca <- prcomp(clr_asv_table_ps)
# NOTE- this is equivalent to first making a Euclidean distance matrix from the
# CLR data table and then running a PCoA. A Euclidean distance matrix of a
# log-ratio-transformed table = an Aitchison distance matrix, so this matches
# the compositional methods listed in Gloor et al.
# Table for the screeplot: the share of total variance carried by each axis.
# The columns are built directly instead of being recovered through the
# deparsed expression that as.data.frame() would use as a column name.
explained_variance <- lograt_pca$sdev^2 / sum(lograt_pca$sdev^2)
lograt_variances <- data.frame(
  # Axis index kept as character, as rownames_to_column() would produce.
  PCaxis = as.character(seq_along(explained_variance)),
  # Stored as a proportion between 0 and 1.
  PercVar = explained_variance,
  stringsAsFactors = FALSE
)
head(lograt_variances)
# Plot screeplot: one bar per PC axis. PercVar is stored as a proportion, so it
# is multiplied by 100 to agree with the "% Variance" axis label.
ggplot(lograt_variances, aes(x = as.numeric(PCaxis), y = 100 * PercVar)) +
  geom_col(fill = "grey", color = "black") +
  theme_minimal() +
  theme(
    axis.title = element_text(color = "black", face = "bold", size = 10),
    axis.text.y = element_text(color = "black", face = "bold"),
    axis.text.x = element_blank()
  ) +
  labs(x = "PC axis", y = "% Variance", title = "Log-Ratio PCA Screeplot, CLR Tranformation")
A faithful representation is to plot this ordination in 2D because the 3rd and 4th axes explain very low % of variances, while the 1st and 2nd explain a decently large proportion of variance (15.7 + 10.5 = 26.2%)
Visualize the PCA-
lograt_pca$x #View PC values
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
T1S10 -2.6238954 -0.3252536 1.27005712 -1.666211452 0.798649968 -0.57848189 0.247580998 -0.558711581 1.54051341 -1.00257763
T1S11 0.2705609 -4.0749320 -6.66931720 2.284305153 2.672326913 -0.41018048 -0.526654868 -0.462324083 -0.41629925 -1.22992687
T1S1 -2.7027302 2.5315244 0.42770674 -2.891886434 -0.089199476 -2.02741275 -1.526307265 4.507918543 -2.80161601 0.99281647
T1S3 -2.1044928 1.0756380 1.49045892 -2.219957763 -0.061377153 -0.38494915 0.320469272 -1.012257490 1.42749170 -0.65295542
T1S5 -1.7135917 1.1790076 2.77474051 -1.340062932 -1.364232051 0.24102503 -0.009868407 -0.958600452 1.66240096 1.90249139
T1S6 -1.6263823 -0.7382611 4.92533402 -2.100496346 -0.992855481 1.31005659 2.080679436 -0.291113329 -0.53023688 -0.48181295
T1S9 -1.8890736 -7.1882980 0.74791280 0.478128606 -1.223416026 -1.18521711 2.834417355 1.011107634 -1.91733560 -1.95236999
T2S10 -5.4723849 -3.7102032 -2.02733221 0.358739428 0.095792276 -4.19640555 1.696323226 0.708131115 1.11063176 -1.77725044
T2S11 4.0600493 -1.6653347 0.95792661 -1.382521282 -1.935788288 0.14614516 -0.219686405 0.066454230 0.28910892 -0.35381697
T2S1 -0.5256139 -1.6043596 -2.82111238 0.471370176 5.396921155 4.13619608 0.992107496 -1.578763421 1.80377493 2.18025904
T2S2 -1.5675813 3.4039386 -2.83994788 -1.328904319 1.293026359 3.90586508 4.611348013 -0.711643119 -2.75478280 1.19354776
T2S3 -2.4949331 1.8527466 0.55242812 -2.302859249 0.001024354 -1.64228999 -0.599524728 -0.235068104 0.87739077 -0.06120956
T2S4 -5.4192303 -0.5712951 -2.80407160 0.328311261 -0.473384127 3.55256896 -0.864999520 3.421745399 -2.94070397 -1.64572626
T2S5 -2.2024924 2.1002160 2.34821607 2.710381042 3.191568957 -0.90891012 -3.088217336 3.454337657 0.47494384 2.76967314
T2S6 -1.8637965 -5.0301644 2.83044061 0.651949689 2.536239781 2.28530167 -1.412309984 2.720676493 2.49919366 1.43936846
T2S9 -8.1183606 -4.0795361 -5.89618438 3.088417424 -6.620641957 0.18089576 0.254590155 -0.882108849 1.90441468 4.35599519
T3S10 4.6214005 -1.6575522 0.58946754 -1.320385118 -1.661933332 0.18164541 -0.035201214 0.001341068 0.26754996 -0.43305903
T3S1 -2.3913351 2.2896417 -0.20957141 -1.068738089 -0.640139573 0.63357839 -1.285553135 -1.195502284 -0.53495156 -0.32737072
T3S2 -2.2678012 3.3894600 0.27244730 2.244919808 0.035628741 0.18047505 -3.537238882 -0.264238101 -0.35901011 0.87134641
T3S3 -0.6141891 1.8274067 -1.15847880 -1.437042040 0.895015921 0.69099724 2.030728395 -1.239263664 -0.42837700 0.42179666
T3S4 -3.4730868 0.5900791 0.55663668 -1.944078098 1.248042434 -0.03954649 0.588369559 -0.373556187 3.33127230 -1.17587526
PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20
T1S10 0.22597717 -0.49591596 0.62088813 -0.11135149 -0.90317596 0.052175005 0.51636597 0.11646915 -1.13617561 -0.939740199
T1S11 -0.33875912 -2.47937788 -1.92292360 -1.61727672 1.42629776 2.633399673 -1.77511843 -0.64393857 -0.16379258 -1.170843742
T1S1 0.42101859 0.65442210 0.05542544 1.05706121 1.01377542 1.748809810 0.60333600 -2.11186111 -3.09626558 1.781651311
T1S3 0.21932314 -0.65001251 0.19899498 -0.42992913 -0.17187489 -0.321107836 -0.29559955 -0.62907644 -0.98165385 -0.420964761
T1S5 0.90236276 2.37041721 -0.90409728 -5.95846889 -0.99039907 2.323748019 1.87406642 -0.52235594 0.47615803 -0.538955197
T1S6 0.32492687 -0.58250914 -3.36962676 1.03850812 2.92510401 0.527322477 0.74283517 1.98271154 -0.90769777 0.208257905
T1S9 -1.82706149 4.63700830 -1.41096275 1.65684273 -0.73128430 -0.439014716 0.22456025 -2.83124825 1.54026880 -1.804244119
T2S10 -0.63570055 -0.04821840 0.03027849 -2.11585154 1.52888813 -1.711272217 -2.05352206 -0.32604955 -0.43090515 1.805775529
T2S11 0.10750821 -0.16232384 0.22061834 0.06591853 -0.35834169 -0.180707353 -0.42978954 0.20962610 -0.45129627 -0.074319773
T2S1 -0.84361171 2.61168296 -1.64637670 0.47643508 -1.53112443 -0.783075685 1.14544456 0.43919900 -1.03784578 1.943550414
T2S2 1.90947454 -0.88042217 2.50315503 -0.71782729 2.65945940 -0.679764282 0.20848879 -1.61984898 0.76279443 -1.652676887
T2S3 -0.28832020 -0.58979394 0.08413418 -0.03476737 0.53372457 -0.528477113 -0.10571160 -0.13196257 -0.50579456 -0.513744834
T2S4 -1.73341212 -0.05444488 2.67180984 -1.40242616 -0.29205849 0.287767338 0.90589382 2.03117571 -0.08582432 0.517562229
T2S5 2.20888695 1.28719261 0.16320560 0.97886685 -0.43996377 -0.185431085 0.30838611 -0.78176830 0.76544762 -0.767696679
T2S6 -2.05629428 -1.67779276 0.06090643 -0.37825238 1.14814511 0.229027684 -1.04539827 -0.09822133 0.79124945 -0.201171927
T2S9 2.52542213 -1.21424602 -0.33127400 1.97232751 -0.63646116 0.587380408 0.56256483 0.20782020 0.12040843 0.385932778
T3S10 0.17608809 -0.09239084 0.27696903 0.01189034 -0.04774737 -0.240905695 -0.07265447 0.14762881 -0.33514522 0.142036307
T3S1 -1.77915713 -1.57684401 -2.40731062 -0.16246954 -1.13827510 -1.572223128 0.39958868 -0.62003482 -1.23180650 -0.908141905
T3S2 -1.80093781 1.40677342 -2.00653035 1.31217445 2.49078997 -0.331605766 -0.44636779 1.88665314 0.62390115 -1.979127915
T3S3 -0.19515396 0.64121705 -1.42175886 0.94633601 1.18314322 0.356666826 0.35724747 -0.39248176 -1.01111108 0.887210503
T3S4 -0.03337711 -1.22064722 1.41331871 0.55048179 -0.99424565 -0.090116768 -0.12121385 -1.12841112 0.14705172 -0.272828981
PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29
T1S10 0.14553176 0.444887701 0.030555073 -0.10425935 -0.4649432055 0.355881811 0.17992733 0.7760907427 -0.627690690
T1S11 1.38300176 0.979099117 0.211703218 0.15116950 -0.8685434581 1.049405232 -1.10236127 0.0002552213 -0.767570144
T1S1 0.34009906 -0.271639014 -0.903345414 1.30069377 -0.5473422995 -0.048022311 -0.10948686 -0.9587529174 -0.015703705
T1S3 0.86994683 0.246913911 -0.158519509 0.12505746 -1.1016182738 0.059396108 0.01323105 1.2624136272 0.275632635
T1S5 -0.28004625 -0.600135978 -0.225638160 -0.22408151 -0.0009001045 -0.794429743 -0.71040916 -0.7007108564 -0.013510170
T1S6 0.30251073 1.892988844 1.479249040 -2.05668429 0.6892908892 -0.364245835 -0.08183022 -0.8163892316 0.653306565
T1S9 0.60401701 -0.026577052 -0.266982730 -0.05322975 0.3969389259 0.099425676 -0.35147954 0.1775020985 0.456170749
T2S10 -2.33247107 -1.423626311 0.484900870 -1.55699001 -0.7921676214 -0.407380274 0.94742076 0.0174250216 -0.392227697
T2S11 -0.37304090 -0.044363953 -0.212358865 0.03656853 -0.2055835293 0.405011932 -0.12423554 -0.1260813064 0.255872703
T2S1 0.51211253 -0.869077708 0.215385184 -0.23843593 -1.4081013750 1.359543178 0.89307294 -0.3493111760 0.367630027
T2S2 -1.23341832 0.332054911 -0.742253664 0.06444234 0.3595826030 0.578803687 1.26134666 -0.7590524011 -0.330851226
T2S3 1.15014708 0.327327365 -0.825669816 -0.49833503 -0.5864129208 0.557311711 0.31739992 1.1282591094 0.969019040
T2S4 0.65239653 -0.397745489 -0.437515949 -1.40322970 0.5273097762 0.006654982 -1.03730521 1.4003376711 0.686279185
T2S5 -1.53158603 1.099204218 0.981168725 -1.53870366 0.6821822418 0.606535859 -0.09598065 1.0952280391 -0.572618674
T2S6 1.14916912 -0.442956604 -0.031343541 1.53126698 1.1626853043 -2.294630143 1.30790014 -0.4321071978 0.039003479
T2S9 -0.06879790 0.314833969 -0.487910272 0.13571852 -0.0935417744 -0.111592202 0.13245163 0.0115432579 0.137289451
T3S10 -0.31137995 0.003053146 -0.131364000 0.14127153 -0.1147585816 0.378698308 -0.11600723 0.0008098405 -0.054375007
T3S1 -0.18259033 -0.941533760 -0.608857056 -0.32371094 1.3136321089 -0.314823572 1.28871003 0.3636956219 0.116792357
T3S2 -1.90095709 -1.656661333 -0.169026823 1.05756571 -0.7505352043 0.531616513 -1.01059260 -0.4445991478 -0.217363962
T3S3 0.23648113 0.150139986 0.744726257 1.08530156 0.9973389162 -0.297207577 -0.70397225 1.0798450685 -0.540203211
T3S4 -1.40747880 0.950362687 2.174998385 1.84091769 0.5522902416 0.509635655 -0.78582948 0.1257947674 1.529647938
PC30 PC31 PC32 PC33 PC34 PC35 PC36 PC37 PC38
T1S10 -0.61090108 -0.29334392 -0.435556282 0.436122013 0.620570063 0.0752049091 0.364291442 0.222722389 0.853366974
T1S11 0.38683053 -0.47170713 -0.507376193 -0.200978899 -0.466127983 -0.0887001046 0.407399148 0.137577782 -0.114942713
T1S1 0.39068097 -0.29859127 0.422021427 0.016638837 0.130295349 -0.3423059226 0.099331214 0.004371715 -0.053692539
T1S3 -0.89085871 0.30772799 0.035796223 0.384888434 1.245611088 0.8423535329 -0.056811384 0.677715223 0.876394557
T1S5 -0.10448823 -0.34639788 -0.108416705 0.280931290 -0.214231135 0.2227704635 0.234041079 0.027794099 -0.168256122
T1S6 0.17543435 -0.16388918 0.408543638 -0.023459302 -0.021177178 -0.1444244416 0.084570850 0.490213102 -0.165936538
T1S9 -0.06692976 0.30760983 -0.009473703 0.040284195 -0.050039551 0.0235304169 0.049103401 0.134936707 -0.032580927
T2S10 0.45881864 -0.27229696 0.378141816 0.079696113 0.025621644 0.2220459377 -0.229916233 -0.116912242 0.023235021
T2S11 -0.07367150 0.04060642 0.327821556 -0.335943407 0.177567813 -0.2684558422 0.561488607 0.476894504 0.234251029
T2S1 0.22719978 0.38827525 0.282206264 -0.438390883 0.036965892 -0.1441772303 0.141507739 0.073251632 -0.131871600
T2S2 -0.33864908 0.16809097 0.064604371 0.086877729 0.002553673 0.1239108829 0.412985888 0.139172474 0.005935413
T2S3 -2.26654726 0.23001824 0.880133582 -0.079399503 -1.611846710 -0.0529833199 -0.104266043 -0.665332121 -0.874030961
T2S4 0.77112458 0.73081680 -0.074041702 0.112370407 0.295223675 0.4241879490 -0.004228575 0.091313711 -0.312300967
T2S5 -0.17121290 -0.74735265 -0.331455620 -0.613891167 -0.362680848 -0.2437333112 -0.107411964 0.564034865 0.585280991
T2S6 -0.45836363 0.59828788 0.257913281 -0.377057458 -0.112145871 0.2661286081 0.001694725 0.444611086 0.313382173
T2S9 0.04952467 0.28054101 -0.020026648 -0.062809756 0.036918082 0.0670041757 -0.061020750 -0.038346173 0.043463127
T3S10 -0.04433020 0.11256207 0.276748498 0.012290749 0.105555296 -0.3108955253 0.225398056 0.383416618 0.104263911
T3S1 0.78849208 -0.09561134 -1.600916711 1.062468681 -0.458306008 -1.1614572080 0.283387190 -0.270171409 0.076478396
T3S2 -0.40605694 0.55137855 0.461720824 0.721368991 0.554321865 0.4202858684 0.512499024 -0.227527215 -0.125192170
T3S3 0.02013393 -0.20179302 -0.865030680 0.385797223 -0.040604332 1.1497492465 -1.231478661 -0.824389167 0.413980530
T3S4 0.97160611 -0.31764671 0.039652545 0.203106279 0.405656989 -0.0564320934 0.292736573 0.165312522 -1.225094311
PC39 PC40 PC41 PC42 PC43 PC44 PC45 PC46 PC47
T1S10 -0.38601496 -0.522398117 -0.2601828761 1.7425428517 -0.150037940 -0.022197976 -2.831425e-03 -0.0100519518 -2.038300e-16
T1S11 -0.22022127 -0.233355749 -0.0045061584 -0.2088644668 -0.045195931 0.051624214 6.578873e-03 0.0074415753 2.957704e-16
T1S1 -0.21897172 -0.033400842 0.1854388730 0.1027272364 0.079799837 -0.025304143 -5.997196e-03 0.0027479728 -9.393528e-16
T1S3 -0.75497648 0.603862482 0.6040479086 -1.0356859876 0.297725332 -0.016166585 -1.318602e-02 0.0043860796 -1.147520e-15
T1S5 0.26017909 0.053160901 -0.0241782401 -0.0575928258 0.020748744 -0.009410090 9.226674e-05 -0.0042139111 2.766017e-15
T1S6 -0.25843482 0.293010354 0.0953476162 0.1450311162 0.083660353 0.004693246 -1.979926e-02 0.0067988303 3.126839e-15
T1S9 -0.19310548 0.050539539 -0.0263737418 0.0752063777 0.060796962 0.005479383 -4.041996e-03 0.0066798004 5.663872e-16
T2S10 0.30388992 0.100877709 -0.0191138984 0.0412173967 0.026576122 -0.032226350 -3.521885e-03 -0.0070238059 -3.287301e-16
T2S11 0.28580250 -0.321592098 -0.4781464260 -0.4109621304 -1.159042956 -1.198398284 -2.007390e-01 0.0455439782 -1.189153e-15
T2S1 -0.11248568 -0.073344813 -0.0955302997 -0.0376190978 -0.004465990 0.023019713 8.557707e-03 0.0032389114 -1.001803e-15
T2S2 0.01410303 -0.018399653 -0.0723573078 -0.0394075311 0.055382093 -0.053886132 2.126331e-02 -0.0041445615 1.569925e-16
T2S3 0.47491853 -0.324154290 0.1113297600 0.0257086927 0.021247549 -0.026144440 -1.289991e-03 -0.0004957201 -1.286297e-15
T2S4 0.02175802 -0.371533083 -0.0654592950 -0.0335770823 -0.032892383 0.006709334 4.995619e-03 -0.0041659031 3.790371e-16
T2S5 0.28095908 -0.242536910 0.0130502993 -0.3851088178 -0.060000171 0.091145738 9.202016e-03 0.0107147824 7.953707e-16
T2S6 0.17245826 -0.168809317 -0.0903469935 0.0539771988 -0.089777526 0.067356765 4.954736e-03 0.0027736662 2.391316e-15
T2S9 -0.04763513 -0.036926578 0.0312385396 0.0438364840 0.023883432 0.018597761 5.213981e-03 0.0019178143 -2.351418e-15
T3S10 0.34199375 -0.272932684 -0.2206710864 -0.2167177793 -0.243551170 0.556300282 2.557553e-01 -0.8175569629 -5.924081e-16
T3S1 -0.17851876 -0.315320817 0.4560668700 -0.3531386713 -0.169198752 0.016921196 -1.278480e-02 -0.0068107404 -9.254750e-16
T3S2 0.12952809 -0.070809741 0.1503751764 0.0970471976 0.009140306 -0.010012191 -6.877311e-03 -0.0104988319 -3.981190e-16
T3S3 1.09009743 0.242828505 -0.7044769588 -0.1231401097 -0.397445764 -0.026921698 2.386564e-02 -0.0229946682 -2.870967e-16
T3S4 0.33863712 -0.189836596 0.1949792205 0.0225704339 0.229417048 -0.063472056 -6.409916e-03 0.0074190104 1.847481e-16
[ reached getOption("max.print") -- omitted 26 rows ]
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
7: In readChar(file, size, TRUE) : truncating string with embedded nuls
8: In readChar(file, size, TRUE) : truncating string with embedded nuls
9: In readChar(file, size, TRUE) : truncating string with embedded nuls
10: In readChar(file, size, TRUE) : truncating string with embedded nuls
# Build the plotting table in one pass: take the PCA scores (lograt_pca$x),
# expose the row names as a SampleID column, and attach the sample metadata
# to each row by SampleID.
pca_lograt_frame <- lograt_pca$x %>%
  data.frame() %>%
  rownames_to_column(var = "SampleID") %>%
  left_join(metadata, by = "SampleID")
head(pca_lograt_frame)
# PC1 vs PC2 scatter of the PCA scores: color encodes Station, point shape encodes Bayside.
# Axis titles embed the value in column 2 of lograt_variances (row 1 for PC1, row 2 for PC2),
# scaled by 100 and rounded to 2 decimals -- presumably the proportion of variance per axis.
pca_lograt_station <- ggplot(
  data = pca_lograt_frame,
  mapping = aes(x = PC1, y = PC2, color = Station)
) +
  geom_point(mapping = aes(shape = Bayside), size = 4) +
  labs(
    x = paste0("PC1 ", round(lograt_variances[1, 2] * 100, 2), "%"),
    y = paste0("PC2 ", round(lograt_variances[2, 2] * 100, 2), "%"),
    title = "CLR-Euclidean PCA"
  ) +
  scale_color_brewer(palette = "Paired") +
  coord_fixed(ratio = 1) + # one unit on PC1 is drawn the same length as one unit on PC2
  theme_bw()
pca_lograt_station
# Write the figure to disk as EPS, 7 x 5 inches
ggsave(
  filename = "figures/pca_clr.eps",
  plot = pca_lograt_station,
  width = 7,
  height = 5,
  units = "in"
)
The CLR-Euclidean PCA reveals there is some separation according to East vs West. The PCA only explains ~26% of the variance so keep going with different ordinations to see if we can get a better representation
The more traditional approach to ordinations is to do a PCoA on a distance matrix such as Bray-Curtis, Jaccard, or Unifrac. While these are not considered compositional approaches, when combined with pre-treatment (transformations) they become more appropriate. One such transformation that I will use here is the Hellinger transformation.
The different distance matrices also tell you a few different things about the dataset so I will run through this to try to see if I can tease those out.
Before calculating any distance matrix, do a transformation of the filtered count table. Hellinger transformation is the square root of the relative abundance, so calculate it based on the ps_ra object:
# Print a summary of the Hellinger-transformed phyloseq object.
# NOTE(review): the chunk that builds ps_hellinger from ps_ra is not visible in this export -- confirm it ran before this point.
ps_hellinger
phyloseq-class experiment-level object
otu_table() OTU Table: [ 368 taxa and 47 samples ]
sample_data() Sample Data: [ 47 samples by 8 sample variables ]
tax_table() Taxonomy Table: [ 368 taxa by 7 taxonomic ranks ]
First, Jaccard, which builds the distance matrix based on presence/absence between samples. It does not take into account relative abundance of the taxa. Therefore this functions well for determining differences driven by rare taxa, which are weighed the same as abundant taxa.
# Presence/absence Jaccard dissimilarity between the rows of the OTU table.
# binary = TRUE is required here: by default vegdist() computes the *quantitative*
# Jaccard index, which is a monotone function of Bray-Curtis (2B / (1 + B)) and
# therefore still reflects abundances instead of presence/absence only.
# NOTE(review): the variance percentages and NMDS stress quoted in the text were
# produced without binary = TRUE and must be refreshed after re-running.
jac_dmat <- vegdist(otu_table(ps_hellinger), method = "jaccard", binary = TRUE)
pcoa_jac <- pcoa(jac_dmat) # perform PCoA
# Relative eigenvalue of every PCoA axis (used below as the share of variance per axis).
# The automatic row names (1, 2, ...) become the PCaxis column, so PercVar is column 2.
jac_variances <- data.frame(PercVar = pcoa_jac$values$Relative_eig) %>%
  rownames_to_column(var = "PCaxis") %>%
  data.frame()
head(jac_variances)
# Screeplot: one bar per PCoA axis (in axis order), bar height = PercVar.
# PCaxis is stored as text, so it is converted to a number to keep the axes in order.
ggplot(data = jac_variances, mapping = aes(x = as.numeric(PCaxis), y = PercVar)) +
  geom_col(fill = "grey", color = "black") +
  labs(title = "Jaccard PCoA Screeplot", x = "PC axis", y = "% Variance") +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(), # too many axes to label individually
    axis.text.y = element_text(color = "black", face = "bold"),
    axis.title = element_text(color = "black", face = "bold", size = 10)
  )
The first two axes (19.0 + 9.6 = 28.6%) are OK. But I am going to experiment and plot the first 3 axes, since the 2nd and 3rd explain a similar amount of variance (9.6% and 8.4%, respectively).
Plot in 3D with Plotly
# Extract the sample coordinates on every PCoA axis (pcoa_jac$vectors), keyed by SampleID
pcoa_jac_df <- data.frame(pcoa_jac$vectors) %>%
  rownames_to_column(var = "SampleID")
# Merge metadata into the pcoa data table
pcoa_jac_df <- left_join(pcoa_jac_df, metadata, by = "SampleID")
head(pcoa_jac_df)
# Percent variance per axis, used in the axis titles of the 3-D plot: column 2 of
# jac_variances (PercVar) rounded to 4 places and multiplied by 100.
# Despite the variable name these are relative values, not raw eigenvalues.
eigenvalues <- round(jac_variances[, 2], digits = 4) * 100
# Plotly - 3-D. Axis 1 is drawn on z, axes 2 and 3 on x and y.
# Color encodes Station, marker symbol encodes Bayside.
# `colors` is given as a plain vector of hex colors (the documented form) rather than a formula.
pcoa_jaccard <- plot_ly(pcoa_jac_df, type = "scatter3d", mode = "markers",
                        x = ~Axis.2, y = ~Axis.3, z = ~Axis.1,
                        colors = brewer.pal(11, "Paired"),
                        color = ~Station,
                        symbols = c("circle", "diamond"), symbol = ~Bayside) %>%
  layout(font = list(size = 12),
         title = "PCoA Jaccard Distance",
         scene = list(xaxis = list(title = paste0("Co 2 ", eigenvalues[2], "%"),
                                   showticklabels = FALSE, zerolinecolor = "black"),
                      yaxis = list(title = paste0("Co 3 ", eigenvalues[3], "%"),
                                   showticklabels = FALSE, zerolinecolor = "black"),
                      zaxis = list(title = paste0("Co 1 ", eigenvalues[1], "%"),
                                   showticklabels = FALSE, zerolinecolor = "black")))
pcoa_jaccard
# Save the interactive widget next to the static figures. The directory name matches the
# lower-case "figures/" used by every ggsave() call, so it also resolves on
# case-sensitive file systems. with_dir() restores the working directory afterwards.
withr::with_dir("figures", htmlwidgets::saveWidget(as_widget(pcoa_jaccard), file = "pcoa_jaccard.html"))
The Jaccard-PCoA shows separation along axis 2 in East vs West differences.
Next, try a Bray-Curtis distance matrix with PCoA, which builds the distance matrix based on presence/absence between samples and relative abundance differences. This ordination will represent well the differences in samples that are driven by taxa with high relative abundances.
# Bray-Curtis dissimilarity between the rows of the Hellinger-transformed OTU table
bray_dmat <- vegdist(x = otu_table(ps_hellinger), method = "bray")
# Principal coordinates analysis on the Bray-Curtis distances
pcoa_bray <- pcoa(bray_dmat)
# Relative eigenvalue of every PCoA axis from the Bray-Curtis PCoA.
# The automatic row names (1, 2, ...) become the PCaxis column, so PercVar is column 2.
bray_variances <- data.frame(PercVar = pcoa_bray$values$Relative_eig) %>%
  rownames_to_column(var = "PCaxis") %>%
  data.frame()
head(bray_variances)
# Screeplot: one bar per PCoA axis (in axis order), bar height = PercVar.
# PCaxis is stored as text, so it is converted to a number to keep the axes in order.
ggplot(data = bray_variances, mapping = aes(x = as.numeric(PCaxis), y = PercVar)) +
  geom_col(fill = "grey", color = "black") +
  labs(title = "Bray-Curtis PCoA Screeplot", x = "PC axis", y = "% Variance") +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(), # too many axes to label individually
    axis.text.y = element_text(color = "black", face = "bold"),
    axis.title = element_text(color = "black", face = "bold", size = 10)
  )
The first two axes (27.7 + 14.3 = 42%) are pretty good again but I am still going to experiment in the plot with the 3rd axis since it is similar to the second (12.2% variance)
Plot in 3D with Plotly
# Extract the sample coordinates on every PCoA axis (pcoa_bray$vectors), keyed by SampleID
pcoa_bray_df <- data.frame(pcoa_bray$vectors) %>%
  rownames_to_column(var = "SampleID")
# Merge metadata into the pcoa data table
pcoa_bray_df <- left_join(pcoa_bray_df, metadata, by = "SampleID")
head(pcoa_bray_df)
# Percent variance per axis, used in the axis titles of the 3-D plot: column 2 of
# bray_variances (PercVar) rounded to 4 places and multiplied by 100.
# Despite the variable name these are relative values, not raw eigenvalues.
eigenvalues <- round(bray_variances[, 2], digits = 4) * 100
# Plotly - 3-D. Axis 1 is drawn on z, axes 2 and 3 on x and y.
# Color encodes Station, marker symbol encodes Bayside.
# `colors` is given as a plain vector of hex colors (the documented form) rather than a formula.
# NOTE(review): this assignment replaces the PCoA result stored in `pcoa_bray` with the plot
# object. Anything that needs the ordination itself afterwards must recompute it or the plot
# should be given its own name -- confirm nothing downstream relies on either meaning.
pcoa_bray <- plot_ly(pcoa_bray_df, type = "scatter3d", mode = "markers",
                     x = ~Axis.2, y = ~Axis.3, z = ~Axis.1,
                     colors = brewer.pal(11, "Paired"),
                     color = ~Station,
                     symbols = c("circle", "diamond"), symbol = ~Bayside) %>%
  layout(font = list(size = 12),
         title = "PCoA Bray-Curtis Distance",
         scene = list(xaxis = list(title = paste0("Co 2 ", eigenvalues[2], "%"),
                                   showticklabels = FALSE, zerolinecolor = "black"),
                      yaxis = list(title = paste0("Co 3 ", eigenvalues[3], "%"),
                                   showticklabels = FALSE, zerolinecolor = "black"),
                      zaxis = list(title = paste0("Co 1 ", eigenvalues[1], "%"),
                                   showticklabels = FALSE, zerolinecolor = "black")))
pcoa_bray
# Save the interactive widget next to the static figures. The directory name matches the
# lower-case "figures/" used by every ggsave() call, so it also resolves on
# case-sensitive file systems. with_dir() restores the working directory afterwards.
withr::with_dir("figures", htmlwidgets::saveWidget(as_widget(pcoa_bray), file = "pcoa_bray.html"))
These results are similar to Jaccard: the second axis seems driven by differences in East vs West. But there are clearly other things going on here with axes 1 and 3. I think this is a good representation of the data: together the 3 axes explain 54.13% of the variance.
Lastly, try a non-metric multidimensional scaling (NMDS) ordination. PCA/PCoA are metric and attempt to rotate axes to fit the distance matrix distribution. An NMDS represents the data in 2 axes by constraining the distribution of the points. Similar to above, this can be combined with different pre-treatments of the data.
First try the compositional approach: an NMDS on CLR-transformed data using Euclidean distances (aka Aitchison distance).
# Euclidean distance between the rows of clr_asv_table_ps
# (presumably the CLR-transformed table built earlier, which makes this the Aitchison distance)
euc_dmat <- dist(x = clr_asv_table_ps, method = "euclidean")
# Two-dimensional NMDS on the precomputed distances, with automatic transformation turned off
euc_nmds <- metaMDS(comm = euc_dmat, k = 2, autotransform = FALSE)
Run 0 stress 0.2105436
Run 1 stress 0.2128996
Run 2 stress 0.2122615
Run 3 stress 0.2111085
Run 4 stress 0.2130327
Run 5 stress 0.223828
Run 6 stress 0.2127398
Run 7 stress 0.2171268
Run 8 stress 0.2233421
Run 9 stress 0.2390473
Run 10 stress 0.2232293
Run 11 stress 0.2146083
Run 12 stress 0.2343976
Run 13 stress 0.2122711
Run 14 stress 0.2175982
Run 15 stress 0.2273345
Run 16 stress 0.227759
Run 17 stress 0.2231565
Run 18 stress 0.2286921
Run 19 stress 0.2426823
Run 20 stress 0.2110142
... Procrustes: rmse 0.01362479 max resid 0.06588342
*** No convergence -- monoMDS stopping criteria:
1: no. of iterations >= maxit
19: stress ratio > sratmax
# Check the stress. Less than 0.1 is good; less than 0.05 is better.
# The value will differ between runs, since the ordination iteratively searches for a solution each time (although the solutions should look similar).
euc_nmds$stress
[1] 0.2105436
# Sample coordinates from the NMDS (euc_nmds$points) as a data frame keyed by SampleID,
# with the sample metadata attached to each row
euc_nmds_df <- euc_nmds$points %>%
  data.frame() %>%
  rownames_to_column(var = "SampleID") %>%
  left_join(metadata, by = "SampleID")
head(euc_nmds_df)
## Plot the Aitchison (CLR-Euclidean) NMDS: color = Station, shape = Bayside.
## The stress of the solution, rounded to 2 decimals, is shown in the title.
nmds_aitch <- ggplot(
  data = euc_nmds_df,
  mapping = aes(x = MDS1, y = MDS2, color = Station, shape = Bayside)
) +
  geom_point(size = 4) +
  xlab("NMDS 1") +
  ylab("NMDS 2") +
  ggtitle(paste0("Aitchison Distance NMDS, Stress = ", round(euc_nmds$stress, 2))) +
  scale_color_brewer(palette = "Paired") +
  coord_fixed(ratio = 1) +
  theme_bw()
nmds_aitch
# Write the figure to disk as EPS, 7 x 5 inches
ggsave(
  filename = "figures/nmds_aitch.eps",
  plot = nmds_aitch,
  width = 7,
  height = 5,
  units = "in"
)
The above has a relatively high stress (>0.2) so should be interpreted with caution. But it does show some separation East vs West along NMDS 1.
Next try a Jaccard NMDS, which will represent differences in presence/absence among samples, emphasizing both abundant and rare taxa the same
# Two-dimensional NMDS on the Jaccard distance matrix (jac_dmat, computed earlier in the notebook),
# with automatic transformation turned off
jac_nmds <- metaMDS(comm = jac_dmat, k = 2, autotransform = FALSE)
Run 0 stress 0.1627003
Run 1 stress 0.1660244
Run 2 stress 0.1496856
... New best solution
... Procrustes: rmse 0.09100415 max resid 0.3183697
Run 3 stress 0.1495161
... New best solution
... Procrustes: rmse 0.05088368 max resid 0.3258651
Run 4 stress 0.1578622
Run 5 stress 0.1664404
Run 6 stress 0.1572462
Run 7 stress 0.1573408
Run 8 stress 0.1495163
... Procrustes: rmse 0.0009228226 max resid 0.004331749
... Similar to previous best
Run 9 stress 0.1496491
... Procrustes: rmse 0.01267127 max resid 0.07692795
Run 10 stress 0.1709846
Run 11 stress 0.1508712
Run 12 stress 0.1496642
... Procrustes: rmse 0.01285554 max resid 0.07716671
Run 13 stress 0.1498306
... Procrustes: rmse 0.05243516 max resid 0.3271674
Run 14 stress 0.1496489
... Procrustes: rmse 0.01266723 max resid 0.07694581
Run 15 stress 0.149831
... Procrustes: rmse 0.05247949 max resid 0.3275625
Run 16 stress 0.1573579
Run 17 stress 0.1496498
... Procrustes: rmse 0.01268426 max resid 0.07733981
Run 18 stress 0.1635248
Run 19 stress 0.1498309
... Procrustes: rmse 0.05245881 max resid 0.3273925
Run 20 stress 0.1496495
... Procrustes: rmse 0.01267903 max resid 0.07724868
*** Solution reached
# Check the stress. Less than 0.1 is good; less than 0.05 is better.
# The value will differ between runs, since the ordination iteratively searches for a solution each time (although the solutions should look similar).
jac_nmds$stress
[1] 0.1495161
# Sample coordinates from the NMDS (jac_nmds$points) as a data frame keyed by SampleID,
# with the sample metadata attached to each row
jac_nmds_df <- jac_nmds$points %>%
  data.frame() %>%
  rownames_to_column(var = "SampleID") %>%
  left_join(metadata, by = "SampleID")
head(jac_nmds_df)
## Plot the Jaccard-distance NMDS: color = Station, shape = Bayside.
## The stress of the solution, rounded to 2 decimals, is shown in the title.
nmds_jaccard <- ggplot(
  data = jac_nmds_df,
  mapping = aes(x = MDS1, y = MDS2, color = Station, shape = Bayside)
) +
  geom_point(size = 4) +
  xlab("NMDS 1") +
  ylab("NMDS 2") +
  ggtitle(paste0("Jaccard Distance NMDS, Stress = ", round(jac_nmds$stress, 2))) +
  scale_color_brewer(palette = "Paired") +
  coord_fixed(ratio = 1) +
  theme_bw()
nmds_jaccard
# Write the figure to disk as EPS, 7 x 5 inches
ggsave(
  filename = "figures/nmds_jaccard.eps",
  plot = nmds_jaccard,
  width = 7,
  height = 5,
  units = "in"
)
This is still a relatively high stress (>0.1) so should be interpreted with caution. Similar to Aitchison-distance nMDS but there is a little more separation of East vs West on NMDS 2 axis.
Next try a Bray-Curtis NMDS, which will represent differences in presence/absence among samples and relative abundance, thus emphasizing impacts of highly abundant taxa.
# Two-dimensional NMDS on the Bray-Curtis distance matrix (bray_dmat, computed earlier in the notebook),
# with automatic transformation turned off
bray_nmds <- metaMDS(comm = bray_dmat, k = 2, autotransform = FALSE)
Run 0 stress 0.1628608
Run 1 stress 0.1498312
... New best solution
... Procrustes: rmse 0.08567004 max resid 0.321179
Run 2 stress 0.1511991
Run 3 stress 0.1496848
... New best solution
... Procrustes: rmse 0.01159486 max resid 0.06870893
Run 4 stress 0.1496492
... New best solution
... Procrustes: rmse 0.05214599 max resid 0.3270977
Run 5 stress 0.1805538
Run 6 stress 0.1495165
... New best solution
... Procrustes: rmse 0.01281364 max resid 0.07805366
Run 7 stress 0.1511997
Run 8 stress 0.1496853
... Procrustes: rmse 0.05087238 max resid 0.3273574
Run 9 stress 0.166339
Run 10 stress 0.1496489
... Procrustes: rmse 0.01281149 max resid 0.07792418
Run 11 stress 0.1508707
Run 12 stress 0.1568975
Run 13 stress 0.1496846
... Procrustes: rmse 0.05087779 max resid 0.3278401
Run 14 stress 0.1625699
Run 15 stress 0.1496844
... Procrustes: rmse 0.05084287 max resid 0.327484
Run 16 stress 0.1937921
Run 17 stress 0.1710636
Run 18 stress 0.1508709
Run 19 stress 0.1648571
Run 20 stress 0.1627921
*** No convergence -- monoMDS stopping criteria:
20: stress ratio > sratmax
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
7: In readChar(file, size, TRUE) : truncating string with embedded nuls
8: In readChar(file, size, TRUE) : truncating string with embedded nuls
9: In readChar(file, size, TRUE) : truncating string with embedded nuls
10: In readChar(file, size, TRUE) : truncating string with embedded nuls
# Check the stress. Less than 0.1 is good; less than 0.05 is better.
# The value will differ between runs, since the ordination iteratively searches for a solution each time (although the solutions should look similar).
bray_nmds$stress
[1] 0.1495165
# Sample coordinates from the NMDS (bray_nmds$points) as a data frame keyed by SampleID,
# with the sample metadata attached to each row
bray_nmds_df <- bray_nmds$points %>%
  data.frame() %>%
  rownames_to_column(var = "SampleID") %>%
  left_join(metadata, by = "SampleID")
head(bray_nmds_df)
## Plot the Bray-Curtis-distance NMDS: color = Station, shape = Bayside.
## The stress of the solution, rounded to 2 decimals, is shown in the title.
nmds_bray <- ggplot(
  data = bray_nmds_df,
  mapping = aes(x = MDS1, y = MDS2, color = Station, shape = Bayside)
) +
  geom_point(size = 4) +
  xlab("NMDS 1") +
  ylab("NMDS 2") +
  ggtitle(paste0("Bray-Curtis Distance NMDS, Stress = ", round(bray_nmds$stress, 2))) +
  scale_color_brewer(palette = "Paired") +
  coord_fixed(ratio = 1) +
  theme_bw()
nmds_bray
# Write the figure to disk as EPS, 7 x 5 inches
ggsave(
  filename = "figures/nmds_bray.eps",
  plot = nmds_bray,
  width = 7,
  height = 5,
  units = "in"
)
Very similar to Jaccard results. High-ish stress (0.15)
–> CONTINUE HERE
XXX